library(rio)
## Warning: package 'rio' was built under R version 4.4.3
library(janitor)
## Warning: package 'janitor' was built under R version 4.4.3
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(ggplot2)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(viridis)
## Loading required package: viridisLite
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:viridis':
##
## viridis_pal
library(readxl)
## Warning: package 'readxl' was built under R version 4.4.3
library(purrr)
##
## Attaching package: 'purrr'
## The following object is masked from 'package:scales':
##
## discard
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:rio':
##
## export
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(fastDummies)
## Warning: package 'fastDummies' was built under R version 4.4.3
library(clustertend)
## Package `clustertend` is deprecated. Use package `hopkins` instead.
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# install.packages("ROSE")
library(ROSE)
## Warning: package 'ROSE' was built under R version 4.4.3
## Loaded ROSE 0.0-4
Determinamos que la variable respuesta para nuestra investigación sería el ratio entre la edad de la víctima y la edad del agresor, que describimos con anterioridad en el análisis exploratorio. Este ratio se define como:
\[ q = \frac{\text{edad de la víctima}}{\text{edad del agresor}} \]
Este nuevo indicador “q” lo dividimos en 3 categorías, según qué tanta diferencia de edad se encontró entre víctimas y agresores. Las categorías son las siguientes: “Mucho menor” (q ≤ 0.8), “Similar” (0.8 < q ≤ 1.2) y “Mucho mayor” (q > 1.2).
Primero, tenemos que unir en un solo dataset los datos de todos los años (2013 a 2023):
# Seed the combined dataset with the 2023 file; addToDataset() appends the
# other years to this global data frame.
original <- import("Datos/2023.sav")

# Read the .sav file for `year` from the Datos/ folder and append its rows to
# the global `original` data frame.
#
# year: value pasted into the file name, e.g. 2013 -> "Datos/2013.sav".
#
# Called for its side effect on `original` (updated with `<<-`).
addToDataset <- function(year) {
  yearly_rows <- import(paste0("Datos/", year, ".sav"))
  original <<- bind_rows(original, yearly_rows)
}
# Replace every occurrence of `values` in one column of the global `original`
# data frame with NA.
#
# column: name of a single column of `original` (character scalar).
# values: vector of codes that should be treated as missing.
#
# Called for its side effect: the cleaned column is written back to `original`
# with `<<-`. Stops with an error if `column` is not a column of `original`.
ifValueConvertToNA <- function(column, values) {
  stopifnot(is.character(column), length(column) == 1L)
  if (!column %in% names(original)) {
    stop("Column not found in `original`: ", column, call. = FALSE)
  }
  # `[[` always extracts the column vector; `[, column]` returns a one-column
  # table when `original` is a tibble, which would break the element-wise
  # `%in%` test below.
  current <- original[[column]]
  original[[column]] <<- ifelse(current %in% values, NA, current)
}
# Append the 2013-2022 files, one call per year, to the global `original`.
walk(2013:2022, addToDataset)
# Sentinel codes that the cleaning loop turns into NA in every column listed in
# `affectedColumns`.
# NOTE(review): all four codes are applied to every column, so a genuine 9
# (for instance in VIC_EDAD, HEC_DIA or HEC_MES) would also become NA --
# confirm against the codebook whether each column needs its own sentinel.
ignoredValues <- c(9, 99, 999, 9999)

# Columns cleaned with `ignoredValues`, in processing order.
affectedColumns <- c(
  # VIC_* and related columns
  "VIC_EDAD", "TOTAL_HIJOS", "NUM_HIJ_HOM", "NUM_HIJ_MUJ",
  "VIC_ALFAB", "VIC_ESCOLARIDAD", "VIC_EST_CIV", "VIC_GRUPET",
  "VIC_NACIONAL", "VIC_TRABAJA", "VIC_OCUP", "VIC_DEDICA",
  "VIC_DISC", "TIPO_DISCAQ", "OTRAS_VICTIMAS",
  "VIC_OTRAS_HOM", "VIC_OTRAS_MUJ", "VIC_OTRAS_N_OS", "VIC_OTRAS_N_AS",
  # HEC_* and related columns
  "HEC_DIA", "HEC_MES", "HEC_ANO", "HEC_DEPTO", "HEC_DEPTOMCPIO",
  "HEC_AREA", "HEC_RECUR_DENUN", "INST_DONDE_DENUNCIO",
  # AGR_* columns ("AGR_GURPET" is spelled exactly as the column in the data)
  "AGR_EDAD", "AGR_ALFAB", "AGR_ESCOLARIDAD", "AGR_EST_CIV", "AGR_GURPET",
  "AGR_NACIONAL", "AGR_TRABAJA", "AGR_OCUP", "AGR_DEDICA",
  "AGRESORES_OTROS_TOTAL",
  "AGR_OTROS_HOM", "AGR_OTRAS_MUJ", "AGR_OTROS_N_OS", "AGR_OTRAS_N_AS",
  "CONDUCENTE", "LEY_APLICABLE",
  # ARTICULO* columns
  "ARTICULOVIF1", "ARTICULOVIF2", "ARTICULOVIF3", "ARTICULOVIF4",
  "ARTICULOVCM1", "ARTICULOVCM2", "ARTICULOVCM3", "ARTICULOVCM4",
  "ARTICULOCODPEN1", "ARTICULOCODPEN2", "ARTICULOCODPEN3", "ARTICULOCODPEN4",
  "ARTICULOTRAS1", "ARTICULOTRAS2", "ARTICULOTRAS3", "ARTICULOTRAS4",
  # Remaining columns
  "MEDIDAS_SEGURIDAD", "ORGANISMO_REMITE", "QUIEN_REPORTA",
  "ORGANISMO_JURISDICCIONAL"
)
# Turn the sentinel codes into NA in each listed column of `original`.
walk(
  affectedColumns,
  function(column_name) ifValueConvertToNA(column_name, ignoredValues)
)

# TIPO_MEDIDA is cleaned as well; its ignored value is "z".
ifValueConvertToNA("TIPO_MEDIDA", c("z"))

# For some reason this column gets created; all of its values are NaN, so it
# is dropped.
original[["filter_$"]] <- NULL

summary(original)
## HEC_DIA HEC_MES HEC_ANO HEC_DEPTO
## Min. : 1.00 Min. : 1.00 Min. :2000 Min. : 1.0
## 1st Qu.: 7.00 1st Qu.: 3.00 1st Qu.:2015 1st Qu.: 1.0
## Median :15.00 Median : 6.00 Median :2018 Median :10.0
## Mean :15.33 Mean : 6.19 Mean :2018 Mean : 8.9
## 3rd Qu.:23.00 3rd Qu.:10.00 3rd Qu.:2021 3rd Qu.:16.0
## Max. :31.00 Max. :12.00 Max. :2023 Max. :22.0
## NA's :16084 NA's :33585 NA's :4170 NA's :330334
## HEC_DEPTOMCPIO HEC_TIPAGRE NUMERO_BOLETA DIA_EMISION
## Min. : 101.0 Min. :1111 Min. : 0 Min. : 1.00
## 1st Qu.: 311.0 1st Qu.:1122 1st Qu.: 40 1st Qu.: 8.00
## Median :1003.0 Median :1222 Median : 95 Median :15.00
## Mean : 961.3 Mean :1603 Mean : 1057 Mean :15.32
## 3rd Qu.:1601.0 3rd Qu.:2122 3rd Qu.: 363 3rd Qu.:23.00
## Max. :2217.0 Max. :2221 Max. :17020 Max. :31.00
## NA's :1859 NA's :254152
## MES_EMISION ANO_EMISION DEPTO DEPTO_MCPIO
## Min. : 1.000 Min. :2013 Min. : 1.0 Min. : 101.0
## 1st Qu.: 4.000 1st Qu.:2015 1st Qu.: 1.0 1st Qu.: 309.0
## Median : 6.000 Median :2018 Median : 9.0 Median :1003.0
## Mean : 6.421 Mean :2018 Mean : 8.7 Mean : 958.3
## 3rd Qu.: 9.000 3rd Qu.:2021 3rd Qu.:15.0 3rd Qu.:1601.0
## Max. :12.000 Max. :2023 Max. :22.0 Max. :2217.0
## NA's :327781
## QUIEN_REPORTA VIC_SEXO VIC_EDAD TOTAL_HIJOS
## Min. :1.000 Min. :1.000 Min. : 1.00 Min. : 0.00
## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:24.00 1st Qu.: 1.00
## Median :1.000 Median :2.000 Median :31.00 Median : 2.00
## Mean :1.031 Mean :1.878 Mean :33.63 Mean : 2.08
## 3rd Qu.:1.000 3rd Qu.:2.000 3rd Qu.:40.00 3rd Qu.: 3.00
## Max. :3.000 Max. :2.000 Max. :98.00 Max. :19.00
## NA's :4362 NA's :5635 NA's :75236
## NUM_HIJ_HOM NUM_HIJ_MUJ VIC_ALFAB VIC_ESCOLARIDAD
## Min. : 0.00 Min. : 0.00 Min. :1.000 Min. :10.0
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.:1.000 1st Qu.:23.0
## Median : 1.00 Median : 1.00 Median :1.000 Median :29.0
## Mean : 1.08 Mean : 1.01 Mean :1.163 Mean :29.7
## 3rd Qu.: 2.00 3rd Qu.: 2.00 3rd Qu.:1.000 3rd Qu.:39.0
## Max. :14.00 Max. :14.00 Max. :2.000 Max. :59.0
## NA's :74409 NA's :74364 NA's :3326 NA's :12268
## VIC_EST_CIV VIC_GRUPET VIC_NACIONAL VIC_TRABAJA
## Min. :1.00 Min. :1.000 Min. :1.000 Min. :1.00
## 1st Qu.:2.00 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.00
## Median :2.00 Median :1.000 Median :1.000 Median :2.00
## Mean :2.28 Mean :1.921 Mean :1.005 Mean :1.66
## 3rd Qu.:3.00 3rd Qu.:2.000 3rd Qu.:1.000 3rd Qu.:2.00
## Max. :5.00 Max. :6.000 Max. :2.000 Max. :2.00
## NA's :71927 NA's :5478 NA's :2488 NA's :2645
## VIC_OCUP VIC_DEDICA VIC_DISC TIPO_DISCAQ
## Min. : 110 Min. :1.0 Min. :1.000 Min. :1.0
## 1st Qu.:5142 1st Qu.:1.0 1st Qu.:2.000 1st Qu.:2.0
## Median :5311 Median :1.0 Median :2.000 Median :3.0
## Mean :6258 Mean :1.1 Mean :1.992 Mean :3.3
## 3rd Qu.:9111 3rd Qu.:1.0 3rd Qu.:2.000 3rd Qu.:6.0
## Max. :9998 Max. :6.0 Max. :2.000 Max. :6.0
## NA's :245216 NA's :129152 NA's :16136 NA's :363397
## VIC_REL_AGR OTRAS_VICTIMAS VIC_OTRAS_HOM VIC_OTRAS_MUJ
## Min. : 1.000 Min. : 0.00 Min. :0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 0.00 1st Qu.:0.00 1st Qu.: 0.00
## Median : 2.000 Median : 0.00 Median :0.00 Median : 0.00
## Mean : 3.446 Mean : 0.85 Mean :0.09 Mean : 0.15
## 3rd Qu.: 4.000 3rd Qu.: 1.00 3rd Qu.:0.00 3rd Qu.: 0.00
## Max. :10.000 Max. :19.00 Max. :8.00 Max. :14.00
## NA's :144298 NA's :144107 NA's :144110
## VIC_OTRAS_N_OS VIC_OTRAS_N_AS HEC_AREA HEC_RECUR_DENUN
## Min. : 0.00 Min. :0.0 Min. :1.000 Min. :1.000
## 1st Qu.: 0.00 1st Qu.:0.0 1st Qu.:1.000 1st Qu.:2.000
## Median : 0.00 Median :0.0 Median :1.000 Median :2.000
## Mean : 0.32 Mean :0.3 Mean :1.426 Mean :1.884
## 3rd Qu.: 0.00 3rd Qu.:0.0 3rd Qu.:2.000 3rd Qu.:2.000
## Max. :11.00 Max. :8.0 Max. :2.000 Max. :2.000
## NA's :144109 NA's :144106 NA's :12853 NA's :13702
## INST_DONDE_DENUNCIO AGR_SEXO AGR_EDAD AGR_ALFAB
## Min. :1.0 Min. :1.000 Min. : 7.00 Min. :1.000
## 1st Qu.:3.0 1st Qu.:1.000 1st Qu.:26.00 1st Qu.:1.000
## Median :4.0 Median :1.000 Median :33.00 Median :1.000
## Mean :3.1 Mean :1.152 Mean :34.56 Mean :1.121
## 3rd Qu.:4.0 3rd Qu.:1.000 3rd Qu.:40.00 3rd Qu.:1.000
## Max. :6.0 Max. :2.000 Max. :98.00 Max. :2.000
## NA's :329334 NA's :24014 NA's :9183
## AGR_ESCOLARIDAD AGR_EST_CIV AGR_GURPET AGR_NACIONAL
## Min. :10.00 Min. :1.00 Min. :1.000 Min. :1.000
## 1st Qu.:24.00 1st Qu.:2.00 1st Qu.:1.000 1st Qu.:1.000
## Median :29.00 Median :2.00 Median :1.000 Median :1.000
## Mean :30.26 Mean :2.28 Mean :1.941 Mean :1.004
## 3rd Qu.:39.00 3rd Qu.:3.00 3rd Qu.:2.000 3rd Qu.:1.000
## Max. :59.00 Max. :5.00 Max. :6.000 Max. :2.000
## NA's :21433 NA's :72192 NA's :6834 NA's :10559
## AGR_TRABAJA AGR_OCUP AGR_DEDICA AGRESORES_OTROS_TOTAL
## Min. :1.000 Min. : 110 Min. :1.00 Min. : 0.00
## 1st Qu.:1.000 1st Qu.:5414 1st Qu.:1.00 1st Qu.: 0.00
## Median :1.000 Median :6111 Median :1.00 Median : 0.00
## Mean :1.207 Mean :6890 Mean :1.82 Mean : 0.21
## 3rd Qu.:1.000 3rd Qu.:9111 3rd Qu.:3.00 3rd Qu.: 0.00
## Max. :2.000 Max. :9998 Max. :6.00 Max. :15.00
## NA's :14964 NA's :100602 NA's :304722 NA's :187468
## AGR_OTROS_HOM AGR_OTRAS_MUJ AGR_OTROS_N_OS AGR_OTRAS_N_AS
## Min. :0.00 Min. :0.0 Min. :0.00 Min. :0.00
## 1st Qu.:0.00 1st Qu.:0.0 1st Qu.:0.00 1st Qu.:0.00
## Median :0.00 Median :0.0 Median :0.00 Median :0.00
## Mean :0.07 Mean :0.1 Mean :0.02 Mean :0.01
## 3rd Qu.:0.00 3rd Qu.:0.0 3rd Qu.:0.00 3rd Qu.:0.00
## Max. :8.00 Max. :8.0 Max. :7.00 Max. :6.00
## NA's :187462 NA's :187462 NA's :187461 NA's :187461
## INST_DENUN_HECHO ORGANISMO_JURISDICCIONAL CONDUCENTE LEY_APLICABLE
## Min. :1.000 Min. : 1.00 Min. :1.00 Min. :1.00
## 1st Qu.:3.000 1st Qu.: 1.00 1st Qu.:1.00 1st Qu.:1.00
## Median :4.000 Median : 1.00 Median :1.00 Median :1.00
## Mean :3.443 Mean : 4.73 Mean :1.38 Mean :1.74
## 3rd Qu.:4.000 3rd Qu.: 7.00 3rd Qu.:2.00 3rd Qu.:3.00
## Max. :6.000 Max. :16.00 Max. :2.00 Max. :6.00
## NA's :240778 NA's :249954 NA's :170923
## ARTICULOVIF1 ARTICULOVIF2 ARTICULOVIF3 ARTICULOVIF4
## Min. : 1.00 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 7.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00
## Median : 7.00 Median : 0.00 Median : 0.00 Median : 0.00
## Mean : 6.71 Mean : 0.35 Mean : 0.24 Mean : 0.26
## 3rd Qu.: 7.00 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.00
## Max. :10.00 Max. :15.00 Max. :17.00 Max. :16.00
## NA's :244073 NA's :244881 NA's :244922 NA's :244164
## ARTICULOVCM1 ARTICULOVCM2 ARTICULOVCM3 ARTICULOVCM4
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 7.0 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0
## Median : 7.0 Median : 0.0 Median : 0.0 Median : 0.0
## Mean : 6.6 Mean : 0.6 Mean : 0.2 Mean : 0.3
## 3rd Qu.: 7.0 3rd Qu.: 0.0 3rd Qu.: 0.0 3rd Qu.: 0.0
## Max. :25.0 Max. :25.0 Max. :25.0 Max. :25.0
## NA's :328046 NA's :325569 NA's :324914 NA's :324774
## ARTICULOCODPEN1 ARTICULOCODPEN2 ARTICULOCODPEN3 ARTICULOCODPEN4
## Min. : 1.0 Min. : 0 Min. : 0.0 Min. : 0.0
## 1st Qu.:203.0 1st Qu.: 0 1st Qu.: 0.0 1st Qu.: 0.0
## Median :215.0 Median : 0 Median : 0.0 Median : 0.0
## Mean :312.2 Mean : 29 Mean : 1.4 Mean : 0.5
## 3rd Qu.:482.0 3rd Qu.: 0 3rd Qu.: 0.0 3rd Qu.: 0.0
## Max. :495.0 Max. :494 Max. :257.0 Max. :205.0
## NA's :364280 NA's :364280 NA's :364280 NA's :364280
## ARTICULOTRAS1 ARTICULOTRAS2 ARTICULOTRAS3 ARTICULOTRAS4
## Min. :141.0 Min. : 0.0 Min. :0 Min. :0
## 1st Qu.:141.0 1st Qu.:142.0 1st Qu.:0 1st Qu.:0
## Median :141.0 Median :142.0 Median :0 Median :0
## Mean :147.4 Mean :113.6 Mean :0 Mean :0
## 3rd Qu.:141.0 3rd Qu.:142.0 3rd Qu.:0 3rd Qu.:0
## Max. :173.0 Max. :142.0 Max. :0 Max. :0
## NA's :365124 NA's :365124 NA's :365124 NA's :365124
## MEDIDAS_SEGURIDAD TIPO_MEDIDA ORGANISMO_REMITE
## Min. :1 Length:365129 Min. : 1.00
## 1st Qu.:1 Class :character 1st Qu.:17.00
## Median :1 Mode :character Median :17.00
## Mean :1 Mean :15.71
## 3rd Qu.:1 3rd Qu.:18.00
## Max. :2 Max. :19.00
## NA's :171957 NA's :277781
Ahora creamos la variable respuesta:
# Keep only the rows where both ages are present, then derive the response:
#   vicRatioAgr    = VIC_EDAD divided by AGR_EDAD
#   diferenciaEdad = "Mucho menor" (ratio <= 0.8), "Similar" (ratio <= 1.2),
#                    "Mucho mayor" (ratio > 1.2)
edad_agr_vic <- original %>%
  filter(!is.na(VIC_EDAD), !is.na(AGR_EDAD)) %>%
  mutate(
    vicRatioAgr = VIC_EDAD / AGR_EDAD,
    diferenciaEdad = case_when(
      vicRatioAgr <= 0.8 ~ "Mucho menor",
      vicRatioAgr <= 1.2 ~ "Similar",
      # Explicit condition (rather than a catch-all) so a missing ratio stays NA.
      vicRatioAgr > 1.2 ~ "Mucho mayor"
    )
  )

summary(edad_agr_vic)
## HEC_DIA HEC_MES HEC_ANO HEC_DEPTO
## Min. : 1.00 Min. : 1.000 Min. :2000 Min. : 1.00
## 1st Qu.: 7.00 1st Qu.: 3.000 1st Qu.:2015 1st Qu.: 2.00
## Median :15.00 Median : 6.000 Median :2018 Median :10.00
## Mean :15.31 Mean : 6.182 Mean :2018 Mean : 9.11
## 3rd Qu.:23.00 3rd Qu.:10.000 3rd Qu.:2021 3rd Qu.:16.00
## Max. :31.00 Max. :12.000 Max. :2023 Max. :22.00
## NA's :14243 NA's :30375 NA's :3242 NA's :306113
## HEC_DEPTOMCPIO HEC_TIPAGRE NUMERO_BOLETA DIA_EMISION
## Min. : 101.0 Min. :1111 Min. : 0 Min. : 1.00
## 1st Qu.: 312.0 1st Qu.:1122 1st Qu.: 39 1st Qu.: 8.00
## Median :1004.0 Median :1222 Median : 93 Median :15.00
## Mean : 963.8 Mean :1600 Mean : 1122 Mean :15.31
## 3rd Qu.:1601.0 3rd Qu.:2122 3rd Qu.: 357 3rd Qu.:23.00
## Max. :2217.0 Max. :2221 Max. :17020 Max. :31.00
## NA's :1241 NA's :236858
## MES_EMISION ANO_EMISION DEPTO DEPTO_MCPIO
## Min. : 1.000 Min. :2013 Min. : 1.00 Min. : 101.0
## 1st Qu.: 4.000 1st Qu.:2015 1st Qu.: 2.00 1st Qu.: 312.0
## Median : 6.000 Median :2018 Median : 9.00 Median :1004.0
## Mean : 6.414 Mean :2018 Mean : 8.99 Mean : 961.8
## 3rd Qu.: 9.000 3rd Qu.:2021 3rd Qu.:15.00 3rd Qu.:1601.0
## Max. :12.000 Max. :2023 Max. :22.00 Max. :2217.0
## NA's :303902
## QUIEN_REPORTA VIC_SEXO VIC_EDAD TOTAL_HIJOS
## Min. :1.000 Min. :1.000 Min. : 1.00 Min. : 0.00
## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:24.00 1st Qu.: 1.00
## Median :1.000 Median :2.000 Median :31.00 Median : 2.00
## Mean :1.029 Mean :1.878 Mean :33.56 Mean : 2.09
## 3rd Qu.:1.000 3rd Qu.:2.000 3rd Qu.:40.00 3rd Qu.: 3.00
## Max. :3.000 Max. :2.000 Max. :98.00 Max. :19.00
## NA's :3679 NA's :62215
## NUM_HIJ_HOM NUM_HIJ_MUJ VIC_ALFAB VIC_ESCOLARIDAD
## Min. : 0.00 Min. : 0.00 Min. :1.000 Min. :10.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.:1.000 1st Qu.:23.00
## Median : 1.00 Median : 1.00 Median :1.000 Median :29.00
## Mean : 1.09 Mean : 1.02 Mean :1.162 Mean :29.72
## 3rd Qu.: 2.00 3rd Qu.: 2.00 3rd Qu.:1.000 3rd Qu.:39.00
## Max. :14.00 Max. :14.00 Max. :2.000 Max. :59.00
## NA's :61418 NA's :61381 NA's :2068 NA's :8208
## VIC_EST_CIV VIC_GRUPET VIC_NACIONAL VIC_TRABAJA
## Min. :1.00 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.00 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000
## Median :2.00 Median :1.000 Median :1.000 Median :2.000
## Mean :2.29 Mean :1.901 Mean :1.005 Mean :1.657
## 3rd Qu.:3.00 3rd Qu.:2.000 3rd Qu.:1.000 3rd Qu.:2.000
## Max. :5.00 Max. :6.000 Max. :2.000 Max. :2.000
## NA's :65122 NA's :4350 NA's :1965 NA's :1815
## VIC_OCUP VIC_DEDICA VIC_DISC TIPO_DISCAQ
## Min. : 110 Min. :1.0 Min. :1.000 Min. :1.0
## 1st Qu.:5142 1st Qu.:1.0 1st Qu.:2.000 1st Qu.:2.0
## Median :5249 Median :1.0 Median :2.000 Median :3.0
## Mean :6255 Mean :1.1 Mean :1.992 Mean :3.3
## 3rd Qu.:9111 3rd Qu.:1.0 3rd Qu.:2.000 3rd Qu.:5.0
## Max. :9998 Max. :6.0 Max. :2.000 Max. :6.0
## NA's :225247 NA's :119518 NA's :13392 NA's :335976
## VIC_REL_AGR OTRAS_VICTIMAS VIC_OTRAS_HOM VIC_OTRAS_MUJ
## Min. : 1.000 Min. : 0.00 Min. :0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 0.00 1st Qu.:0.00 1st Qu.: 0.00
## Median : 2.000 Median : 0.00 Median :0.00 Median : 0.00
## Mean : 3.399 Mean : 0.85 Mean :0.09 Mean : 0.15
## 3rd Qu.: 3.000 3rd Qu.: 1.00 3rd Qu.:0.00 3rd Qu.: 0.00
## Max. :10.000 Max. :19.00 Max. :8.00 Max. :14.00
## NA's :127533 NA's :127348 NA's :127351
## VIC_OTRAS_N_OS VIC_OTRAS_N_AS HEC_AREA HEC_RECUR_DENUN
## Min. : 0.00 Min. :0.0 Min. :1.000 Min. :1.000
## 1st Qu.: 0.00 1st Qu.:0.0 1st Qu.:1.000 1st Qu.:2.000
## Median : 0.00 Median :0.0 Median :1.000 Median :2.000
## Mean : 0.32 Mean :0.3 Mean :1.428 Mean :1.884
## 3rd Qu.: 0.00 3rd Qu.:0.0 3rd Qu.:2.000 3rd Qu.:2.000
## Max. :11.00 Max. :8.0 Max. :2.000 Max. :2.000
## NA's :127350 NA's :127347 NA's :10560 NA's :11185
## INST_DONDE_DENUNCIO AGR_SEXO AGR_EDAD AGR_ALFAB
## Min. :1.00 Min. :1.000 Min. : 7.00 Min. :1.000
## 1st Qu.:3.00 1st Qu.:1.000 1st Qu.:26.00 1st Qu.:1.000
## Median :4.00 Median :1.000 Median :33.00 Median :1.000
## Mean :3.11 Mean :1.151 Mean :34.55 Mean :1.118
## 3rd Qu.:4.00 3rd Qu.:1.000 3rd Qu.:40.00 3rd Qu.:1.000
## Max. :6.00 Max. :2.000 Max. :98.00 Max. :2.000
## NA's :303926 NA's :3680
## AGR_ESCOLARIDAD AGR_EST_CIV AGR_GURPET AGR_NACIONAL AGR_TRABAJA
## Min. :10.00 Min. :1.00 Min. :1.00 Min. :1.000 Min. :1.000
## 1st Qu.:24.00 1st Qu.:2.00 1st Qu.:1.00 1st Qu.:1.000 1st Qu.:1.000
## Median :29.00 Median :2.00 Median :1.00 Median :1.000 Median :1.000
## Mean :30.33 Mean :2.28 Mean :1.92 Mean :1.004 Mean :1.203
## 3rd Qu.:39.00 3rd Qu.:3.00 3rd Qu.:2.00 3rd Qu.:1.000 3rd Qu.:1.000
## Max. :59.00 Max. :5.00 Max. :6.00 Max. :2.000 Max. :2.000
## NA's :12841 NA's :63726 NA's :5363 NA's :8525 NA's :8017
## AGR_OCUP AGR_DEDICA AGRESORES_OTROS_TOTAL AGR_OTROS_HOM
## Min. : 110 Min. :1.00 Min. : 0.0 Min. :0.00
## 1st Qu.:5414 1st Qu.:1.00 1st Qu.: 0.0 1st Qu.:0.00
## Median :6111 Median :1.00 Median : 0.0 Median :0.00
## Mean :6876 Mean :1.84 Mean : 0.2 Mean :0.07
## 3rd Qu.:9111 3rd Qu.:3.00 3rd Qu.: 0.0 3rd Qu.:0.00
## Max. :9998 Max. :6.00 Max. :15.0 Max. :8.00
## NA's :85227 NA's :280879 NA's :167456 NA's :167452
## AGR_OTRAS_MUJ AGR_OTROS_N_OS AGR_OTRAS_N_AS INST_DENUN_HECHO
## Min. :0.0 Min. :0.00 Min. :0.00 Min. :1.000
## 1st Qu.:0.0 1st Qu.:0.00 1st Qu.:0.00 1st Qu.:3.000
## Median :0.0 Median :0.00 Median :0.00 Median :4.000
## Mean :0.1 Mean :0.02 Mean :0.01 Mean :3.439
## 3rd Qu.:0.0 3rd Qu.:0.00 3rd Qu.:0.00 3rd Qu.:4.000
## Max. :8.0 Max. :7.00 Max. :6.00 Max. :6.000
## NA's :167451 NA's :167451 NA's :167451
## ORGANISMO_JURISDICCIONAL CONDUCENTE LEY_APLICABLE ARTICULOVIF1
## Min. : 1.00 Min. :1.00 Min. :1.00 Min. : 1.0
## 1st Qu.: 1.00 1st Qu.:1.00 1st Qu.:1.00 1st Qu.: 7.0
## Median : 1.00 Median :1.00 Median :1.00 Median : 7.0
## Mean : 4.69 Mean :1.37 Mean :1.75 Mean : 6.7
## 3rd Qu.: 7.00 3rd Qu.:2.00 3rd Qu.:3.00 3rd Qu.: 7.0
## Max. :16.00 Max. :2.00 Max. :6.00 Max. :10.0
## NA's :225602 NA's :233203 NA's :159784 NA's :228391
## ARTICULOVIF2 ARTICULOVIF3 ARTICULOVIF4 ARTICULOVCM1
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 7.00
## Median : 0.00 Median : 0.00 Median : 0.00 Median : 7.00
## Mean : 0.36 Mean : 0.24 Mean : 0.26 Mean : 6.57
## 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 7.00
## Max. :15.00 Max. :17.00 Max. :16.00 Max. :25.00
## NA's :229166 NA's :229184 NA's :228480 NA's :303468
## ARTICULOVCM2 ARTICULOVCM3 ARTICULOVCM4 ARTICULOCODPEN1
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 4
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.:203
## Median : 0.00 Median : 0.00 Median : 0.00 Median :215
## Mean : 0.57 Mean : 0.21 Mean : 0.28 Mean :322
## 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.:483
## Max. :25.00 Max. :25.00 Max. :25.00 Max. :495
## NA's :301080 NA's :300434 NA's :300297 NA's :336796
## ARTICULOCODPEN2 ARTICULOCODPEN3 ARTICULOCODPEN4 ARTICULOTRAS1
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. :141.0
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.:141.0
## Median : 0.0 Median : 0.0 Median : 0.0 Median :141.0
## Mean : 29.8 Mean : 1.6 Mean : 0.6 Mean :147.4
## 3rd Qu.: 0.0 3rd Qu.: 0.0 3rd Qu.: 0.0 3rd Qu.:141.0
## Max. :494.0 Max. :257.0 Max. :205.0 Max. :173.0
## NA's :336796 NA's :336796 NA's :336796 NA's :337519
## ARTICULOTRAS2 ARTICULOTRAS3 ARTICULOTRAS4 MEDIDAS_SEGURIDAD
## Min. : 0.0 Min. :0 Min. :0 Min. :1
## 1st Qu.:142.0 1st Qu.:0 1st Qu.:0 1st Qu.:1
## Median :142.0 Median :0 Median :0 Median :1
## Mean :113.6 Mean :0 Mean :0 Mean :1
## 3rd Qu.:142.0 3rd Qu.:0 3rd Qu.:0 3rd Qu.:1
## Max. :142.0 Max. :0 Max. :0 Max. :2
## NA's :337519 NA's :337519 NA's :337519 NA's :160713
## TIPO_MEDIDA ORGANISMO_REMITE vicRatioAgr diferenciaEdad
## Length:337524 Min. : 1.00 Min. :0.0200 Length:337524
## Class :character 1st Qu.:17.00 1st Qu.:0.8276 Class :character
## Mode :character Median :17.00 Median :0.9375 Mode :character
## Mean :15.71 Mean :1.0165
## 3rd Qu.:18.00 3rd Qu.:1.0588
## Max. :19.00 Max. :9.0000
## NA's :257336
Con esto podemos decir que el dataset tiene 77 variables y 337524 observaciones.
Decidimos dividir el dataset en 2 grupos: uno de entrenamiento, con el 70% de los datos, y otro de validación, con el 30% restante. Los grupos se ven así:
# Fix the RNG seed so the same partition is drawn on every run.
set.seed(69420)

# Row indices for the training split: 70% of the rows, sampled by
# createDataPartition() using the response column as the outcome.
train_index <- createDataPartition(
  y = edad_agr_vic[["diferenciaEdad"]],
  p = 0.7,
  list = FALSE
)

# Selected rows form the training set; every other row goes to the test set.
train_data <- edad_agr_vic[train_index, ]
test_data <- edad_agr_vic[-train_index, ]
# Interactive bar chart: row count per response category in the training split.
train_plot <- ggplot(train_data, aes(x = diferenciaEdad)) +
  geom_bar(fill = "skyblue") +
  labs(title = "Datos de Entrenamiento", x = "Categoria", y = "Cuenta")
ggplotly(train_plot)

# Same chart for the validation split.
test_plot <- ggplot(test_data, aes(x = diferenciaEdad)) +
  geom_bar(fill = "orange") +
  labs(title = "Datos de Validacion", x = "Categoria", y = "Cuenta")
ggplotly(test_plot)
Como se puede ver, existe una alta desigualdad en los datos, puesto que la gran mayoría de los casos se dan cuando la víctima tiene una edad similar a la del agresor. Definitivamente esto será algo a tomar en cuenta durante el entrenamiento del modelo; una de las posibles optimizaciones a evaluar podría ser balancear los datos de entrenamiento para mejorar la precisión.